mkdir -p results
# Make the blastdbs - they are output in same folder as the fasta input
conda activate MCHelper
while IFS=',' read -r species strain marta dean; do
makeblastdb -in ${marta} -dbtype "nucl"
done < <(sed '1d' curated_libs.csv)
# Run BLASTS - reciprocal not needed
while IFS=',' read -r species strain marta dean; do
blastn -query ${dean} -db ${marta} -outfmt 6 -max_hsps 1 -out results/${species}_${strain}_dean_vs_marta.blast.out
done < <(sed '1d' curated_libs.csv)Manual curation trials
Comparing results of different TEammo curators
Run BLASTs between curated libraries
Visualisation
if(!require("BiocManager", character.only = T)) install.packages("BiocManager")
pkgs.list <- readLines("r-requirements.txt")
for (i in 1:length(pkgs.list)){
if(!require(pkgs.list[i], character.only = T)){
BiocManager::install(pkgs.list[i], Ncpus = ceiling(parallel::detectCores()/1.7))
require(pkgs.list[i], character.only = TRUE)
}else{require(pkgs.list[1],character.only = TRUE)}
}Load and overall comparisons
setwd("/home/csic/gcy/jgp/extra_storage/dean/mctrials/mctrials")
source("scripts/mccompare_functions.R")
# Color schemes
palette_seq_match <- c(
"Missing from query lib" = "black", # Clear and bold
"Missing from subject lib" = "grey50", # Distinct but neutral
"Present, 70" = RColorBrewer::brewer.pal(3, "Blues")[2], # Medium blue
"Present, 80" = RColorBrewer::brewer.pal(3, "Blues")[3], # Darker blue
"Perfect, 95-100" = RColorBrewer::brewer.pal(3, "Greens")[3] # Bright green
)
# Load the TE classifications
teClassification <- read.table("orozco_classification-2024_mchelper.csv", sep = ";", header = TRUE)
teClassification <- teClassification %>%
mutate(AppName = paste(Class, Order, Superfamily, sep ="/") %>% sub("/$", "", .) %>% sub("/$", "", .))
teClassification <- rbind(
teClassification
, c("", "Unclassified", "", "Unclassified")
)
# Load libraries to compare
# Drosophila birchii
lib_Dbir_marta <- teReadLib(
"../../data/final_libraries/D.birchii/D.birchii.TE_library.fasta",
libIdentifier = "Marta",
species = "D.birchii",
strain = "v1.0_00_GenBank")
lib_Dbir_dean <- teReadLib(
"data/MCH_output/D.birchii/v1.0_00_GenBank/D.birchii_v1.0_00_GenBank_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.birchii",
strain = "v1.0_00_GenBank")
# Drosophila merina
lib_Dmer_marta <- teReadLib(
"../../data/final_libraries/D.merina/D.merina.TE_library.fasta",
libIdentifier = "Marta",
species = "D.merina", strain = "NA")
lib_Dmer_dean <- teReadLib(
"data/MCH_output/D.merina/NA/D.merina_NA_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.merina", strain = "NA")
# Drosophila santomea
lib_Dsan_marta <- teReadLib(
"../../data/final_libraries/D.birchii/D.birchii.TE_library.fasta",
libIdentifier = "Marta",
species = "D.santomea",
strain = "STO_CAGO_1482_RefSeq")
lib_Dsan_dean <- teReadLib(
"data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/D.santomea_STO_CAGO_1482_RefSeq_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.santomea",
strain = "STO_CAGO_1482_RefSeq")
# Drosophila tristis
lib_Dtri_marta <- teReadLib(
"../../data/final_libraries/D.tristis/D.tristis.TE_library.fasta",
libIdentifier = "Marta",
species = "D.tristis", strain = "nanopore_D2")
lib_Dtri_dean <- teReadLib(
"data/MCH_output/D.tristis/nanopore_D2/D.tristis_nanopore_D2_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.tristis", strain = "nanopore_D2")# Load the blast results comparing curated libraries
blast_Dbir_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.birchii_v1.0_00_GenBank_dean_vs_marta.blast.out",
blast_query_lib = lib_Dbir_dean,
blast_subject_lib = lib_Dbir_marta,
species = "D.birchii",
strain = "v1.0_00_GenBank",
comparison = "dean_vs_marta")
blast_Dmer_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.merina_NA_dean_vs_marta.blast.out",
blast_query_lib = lib_Dmer_dean,
blast_subject_lib = lib_Dmer_marta,
species = "D.merina",
strain = "NA",
comparison = "dean_vs_marta")
blast_Dsan_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.santomea_STO_CAGO_1482_RefSeq_dean_vs_marta.blast.out",
blast_query_lib = lib_Dsan_dean,
blast_subject_lib = lib_Dsan_marta,
species = "D.santomea",
strain = "STO_CAGO_1482_RefSeq",
comparison = "dean_vs_marta")
blast_Dtri_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.tristis_nanopore_D2_dean_vs_marta.blast.out",
blast_query_lib = lib_Dtri_dean,
blast_subject_lib = lib_Dtri_marta,
species = "D.tristis",
strain = "nanopore_D2",
comparison = "dean_vs_marta")
# Bring them all together for overall
blast_all <- bind_rows(
blast_Dbir_dean_vs_marta,
blast_Dmer_dean_vs_marta,
blast_Dsan_dean_vs_marta,
blast_Dtri_dean_vs_marta
)How do the libraries compare overall?
tePlotLib(list(lib_Dbir_dean, lib_Dbir_marta))tePlotLib(list(lib_Dmer_dean, lib_Dmer_marta))tePlotLib(list(lib_Dsan_dean, lib_Dsan_marta))tePlotLib(list(lib_Dtri_dean, lib_Dtri_marta))How often do the curators agree based on BLASTn sequence identity between their final curated libraries?
PlotBlastBarMatches(blast_all)PlotBlastBarMatches(blast_Dbir_dean_vs_marta)PlotBlastBarMatches(blast_Dmer_dean_vs_marta)PlotBlastBarMatches(blast_Dsan_dean_vs_marta)PlotBlastBarMatches(blast_Dtri_dean_vs_marta)How often do curators agree on TE classifications?
PlotBlastTileMatches(blast_all)PlotBlastTileMatches(blast_Dbir_dean_vs_marta)PlotBlastTileMatches(blast_Dmer_dean_vs_marta)PlotBlastTileMatches(blast_Dsan_dean_vs_marta)PlotBlastTileMatches(blast_Dtri_dean_vs_marta)TE size distribution
lib1 <- width(lib_Dtri_dean) %>%
data.frame(length = .)
lib1$curator <- "dean"
lib2 <- width(lib_Dtri_marta) %>%
data.frame(length = .)
lib2$curator <- "marta"
bind_rows(lib1, lib2) %>%
ggplot(aes(x = length)) +
geom_density(aes(fill = curator), alpha = 0.5, color = "black") +
labs(title = "Density Plot of DNA Sequence Lengths",
x = "Sequence Length (bp)",
y = "Density") +
theme_minimal()